## Example code for running the Nielsen Stemmer for Arabic
## Rich Nielsen
## nielsen.rich@gmail.com

## If you use this stemmer, please cite:
## Nielsen, Richard A. 2017. Deadly Clerics: Blocked Ambition and the Paths
## to Jihad. Cambridge University Press.

## install the stemmer (only if it is not already installed, so re-running
## this script does not reinstall the package every time)
if (!requireNamespace("arabicStemR", quietly = TRUE)) {
  install.packages("arabicStemR")
}

## load the package
library(arabicStemR)

## bring in some text (I'm grabbing the current front page of Aljazeera)
dat <- paste(
  readLines("http://aljazeera.net/portal", encoding = "UTF-8"),
  collapse = " "
)

## remove the html
dat <- gsub("<.*?>", "", dat)

## stem and transliterate the results
stem(dat)

## stem and return the stemlist
out <- stem(dat, returnStemList = TRUE)
out$text
out$stemlist

## This allows you to see which words are being combined
## Interpret this as follows:
i <- 1
## This is the i'th stem in quotes (with the original word as the label)
out$stemlist[i]
## These are all the words that resolve to the same stem.
names(out$stemlist)[out$stemlist == out$stemlist[i]]
## And this will provide a count.
mytab <- table(names(out$stemlist)[out$stemlist == out$stemlist[i]])
## A separate loop index is used so that `i` (the stem being inspected)
## is not overwritten; seq_along() is also safe if the table is empty.
for (j in seq_along(mytab)) {
  print(mytab[j])
}
## Note that if you just look at "mytab", it will appear incorrect because
## R displays the Arabic labels from right to left but the numbers from left
## to right (thanks R!).

## This can be done for all of the stems.
## lapply() always returns a list, so result[[i]] below is well defined even
## when every stem happens to have the same number of distinct source words
## (sapply() would simplify that case to a matrix).
result <- lapply(out$stemlist, function(x) {
  table(names(out$stemlist)[out$stemlist == x])
})
for (i in seq_along(result)) {
  cat(paste("stemmed:", out$stemlist[i], "\n"))
  cat("unstemmed:")
  print(result[[i]])
  cat("\n")
}

## display the results correctly for the i'th stem
i <- 1
for (j in seq_along(result[[i]])) {
  print(result[[i]][j])
}

## Some options can be left out with arguments: cleaning Arabic characters,
## Latin characters, and transliteration:
## This version keeps Latin characters and does not transliterate, but does
## stem the Arabic.
## `dat` is the text loaded earlier in this script.
stem(dat, cleanChars = FALSE, cleanLatinChars = FALSE, transliteration = FALSE)

## The stem function is just a wrapper for the sub-functions.
## (Typing the bare name prints the function's source.)
stem

## This means that you can create your own custom stemming easily
## For example, if you want to not clean up diacritics:

## stemCustom: same pipeline as the steps listed below, but with the
## removeDiacritics() step disabled so Arabic diacritics are kept.
##
## Arguments:
##   dat             - the text to stem
##   cleanChars      - if TRUE, apply cleanChars() to the text
##   cleanLatinChars - if TRUE, apply cleanLatinChars() to the text
##   transliteration - if TRUE, apply transliterate() to the stemmed text
##   returnStemList  - if TRUE, return list(text=, stemlist=) built from
##                     doStemming(); otherwise return only the stemmed text
##
## Note: the arguments `cleanChars` and `cleanLatinChars` share their names
## with the package functions called below. This works because R skips
## non-function objects when resolving a name in call position.
stemCustom <- function(dat,
                       cleanChars = TRUE,
                       cleanLatinChars = TRUE,
                       transliteration = TRUE,
                       returnStemList = FALSE) {
  dat <- removeNewlineChars(dat)     ## gets rid of \n\r\t\f\v
  dat <- removePunctuation(dat)      ## gets rid of punctuation
  ####dat <- removeDiacritics(dat)   ## gets rid of Arabic diacritics
  dat <- removeEnglishNumbers(dat)   ## gets rid of English numbers
  dat <- removeArabicNumbers(dat)    ## gets rid of Arabic numbers
  dat <- removeFarsiNumbers(dat)     ## gets rid of Farsi numbers
  dat <- fixAlifs(dat)               ## standardizes different hamzas on alif seats
  if (cleanChars) {
    ## removes all unicode chars except Latin chars and Arabic alphabet
    dat <- cleanChars(dat)
  }
  if (cleanLatinChars) {
    ## removes all Latin chars
    dat <- cleanLatinChars(dat)
  }
  dat <- removeStopWords(dat)$text   ## removes the stopwords

  if (returnStemList == TRUE) {
    ## removes prefixes and suffixes, and can return a list matching words
    ## to stemmed words
    tmp <- doStemming(dat)
    dat <- tmp$text
    stemlist <- tmp$stemmedWords
    if (transliteration) {
      dat <- transliterate(dat)      ## performs transliteration
    }
    return(list(text = dat, stemlist = stemlist))
  } else {
    dat <- removePrefixes(dat)       ## removes prefixes
    dat <- removeSuffixes(dat)       ## removes suffixes
    if (transliteration) {
      dat <- transliterate(dat)      ## performs transliteration
    }
    return(dat)
  }
}

## Some Arabic with diacritics
x <- '\u0627\u0647\u0644\u0627\u064b \u0648\u0633\u0644\u0627\u064b'
print(x)

## Stem it, retaining diacritics
stemCustom(x, cleanChars = FALSE, transliteration = FALSE)
## Note that cleanChars must be set to FALSE for diacritics to be retained
## because cleanChars removes Arabic Diacritics as well.

## to see what stopwords are removed
removeStopWords("a")$arabicStopwordList